*_____________________________________________________________________________________________________________________________________________________
*
**# OVERALL PLAN
*_____________________________________________________________________________________________________________________________________________________

/*	Note that each section explains what is going on in a little bit more detail:
	GLOBALS & FILE LOCATIONS 				Where to find datafiles, plus other locations and settings
	PRELIMINARIES							Renaming variables to lowercase, + running the RAND harmonisation files
	CLEANING								My further data cleaning
	QUICK CHECKS 							Misc things - missing variable count / prevalence of each individual dis measure / main analyses in test formation

*/
	
*_____________________________________________________________________________________________________________________________________________________
*
**# GLOBALS AND FILE LOCATIONS
*_____________________________________________________________________________________________________________________________________________________

// Work out which machine we are on: ${user} is set to the last candidate home
// directory that -cd- can enter (note that -cd- also changes the working directory).
// To run this file on another machine, add its home directory to the list.
foreach home in "C:\Users\k2256879" "C:\Users\benba" {
	capture cd "`home'"
	if _rc==0	global user 	"`home'"
}
// Stop with a clear message if no candidate exists (and ${user} was not already set),
// rather than letting the -do- calls below fail on a path with an empty prefix.
if "${user}"=="" {
	dis as error "None of the known home directories exists on this machine - add yours to the list above"
	exit 601
}
// initialising, which clears everything, and I want to be able to re-run the globals without doing this!
do "${user}\OneDrive - King's College London\Personal\ADO files\StataStartup.do"
do "${user}\OneDrive - King's College London\Disability work\ESRC Future Leaders Disability\Phase 1 (Dis Emp Rates) - Intl\ELSA-SHARE-HRS\0_initialising.do" 
// globals
do "${user}\OneDrive - King's College London\Disability work\ESRC Future Leaders Disability\Phase 1 (Dis Emp Rates) - Intl\ELSA-SHARE-HRS\0_globals.do"				
dis in red "File location and variable list globals are loaded"
// -exit- ends execution of this do-file here, so running the whole file only loads the
// settings; the sections below have to be selected and run explicitly.
exit


*_____________________________________________________________________________________________________________________________________________________
*
**# PRELIMINARIES - to be done once
*_____________________________________________________________________________________________________________________________________________________

pause Only needs running once

// LOWERCASE COPIES OF THE RAW DATASETS
// For ELSA the lowercase version goes into a new '_lower' file (otherwise the SHARE syntax doesn't work).
// Each global named below holds the path of the '_lower' file; the raw file is the same path without '_lower'.
// The main wave files (1-7) come first, followed by the nurse-visit files (waves 2, 4 and 6).
	foreach stem in elsaw1 elsaw2 elsaw3 elsaw4 elsaw5 elsaw6 elsaw7 elsaw2_nurse elsaw4_nurse elsaw6_nurse {
		local rawfile = subinstr("${`stem'}", "_lower.dta", ".dta", .)	// path of the original (mixed-case) file
		use "`rawfile'", replace
		renvars, lower													// all variable names to lowercase
		save "${`stem'}", replace										// written to the '_lower' path, for main use
	}
	* The HRS RAND fat file is lowercased in place (-save, replace- overwrites the file just loaded)
	use "${hrsdir}\RAND fat files\hd10f5c.dta"
	renvars, lower
	save, replace

// RAND HARMONISATION FILES - further variables are added to these in the cleaning .do files
	do "${dodir}\Data cleaning\1_harmonised_ELSA_master.do"
	do "${dodir}\Data cleaning\1_H_SHARE_long_vD.do"					// from 13/11/2017, was RAND file version D
	do "${dodir}\Data cleaning\2_H_SHARE_v5_master.do"					// SHARE v5 - this is necessary for some variables that BBG/MvdH were unable to make work with the revised imputations



*_____________________________________________________________________________________________________________________________________________________
*
**# CLEANING - to be done each time the dataset is changed - takes about 5mins to run
*_____________________________________________________________________________________________________________________________________________________

/* EXPLANATION OF THIS SECTION:
	This runs 1_cleaning_HRS / _SHARE / _ELSA, to clean each file.
	Then appends them together and does further cleaning (2_cleaningXsurvey_labelling.do), and sets up weighting.
	Adds retirement age (2_ret_age.do) - see also the Boheim-Leoni paper fn2 and Appendix 3.
*/

// Individual files - one cleaning do-file per survey (approximate run times alongside)
do "${dodir}\1_cleaning_HRS.do"					// 1min 
do "${dodir}\1_cleaning_SHARE.do"				// 2.5mins
do "${dodir}\1_cleaning_ELSA.do"				// 20secs


// Further cleaning - takes about 40secs
// NOTE(review): the labelling step below is annotated as 75secs on its own, so this estimate looks out of date - confirm
* Merging: start from the HRS file and append the ELSA and SHARE files to it
	use "${hrsdir}\HRS_BBG_${versno}.dta", replace
	append using "${elsadir}\ELSA_BB_${versno}.dta"
	append using "${sharedir}\SHARE_BB_${versno}.dta"
* Cleaning the merged file
	do "${dodir}\2_cleaningXsurvey_labelling.do"	// 75secs
		// Intermediate save before the retirement-age step - presumably 2_ret_age.do reads this file; TODO confirm
		save "${workingdata}\HRS-SHARE-ELSA_BBG_${versno}.dta", replace
	do "${dodir}\2_ret_age.do"
	// Declare the survey design (weight rwtresp2, strata survey_waveN) before the final save
	svyset [pweight=rwtresp2], strata(survey_waveN)	// 3 secs - note this was previously done just below for my own file, I think Boheim-Leoni do something different
// Final save overwrites the intermediate file written above
save "${workingdata}\HRS-SHARE-ELSA_BBG_${versno}.dta", replace		
		


*_____________________________________________________________________________________________________________________________________________________
*
**# MAIN RESULTS (prefix '5')
*_____________________________________________________________________________________________________________________________________________________

/* 	MEASURES: The list of disability measures is defined in the 0_globals.do file, so that it is always loaded when this file is run with Ctrl-D */

	
////////////////////////////////////////////////////////
// Disability and replicate weights 
////////////////////////////////////////////////////////

* Disability weights: load the cleaned cross-survey file, set the globals that 4_dis_weights.do
* works from, run it, then export the sample sizes and save the result for the analyses below
use "${workingdata}\HRS-SHARE-ELSA_BBG_${versno}.dta", replace
	global whichversions	"IRT H allIRT L predfxd H"
	global agerange			"(ragey>=50 & ragey<70)"
	// Country-wave selection. Poland (#29, 2013) just has too small a sample.
	// NOTE(review): country 29 is in the 2015 list but is not excluded from the 2013 clause, so it
	// appears to be kept for both years - confirm this is intended.
	global countrywaves		"(year==2015 & inlist(country,19,23,29,34,35)) | (year==2013 & !inlist(country,19,23,34,35)) | (country==61 & year==2014) | (country==51 & year==2010)"
do "${dodir}\4_dis_weights.do"				// Creates the latent disability variable based on the globals above
	// Sample size for each country-wave in this run (there's an alternative under 'Alternatives')
	estpost tabulate country_waveN
		esttab using "${dodir}\Outputs\n_samplesizes.rtf", replace			///
			cells("b(label(freq))")											///
			varlabels(, blist(Total "{hline @width}{break}"))				///
			nonumber nomtitle noobs varwidth(25)
	save "${workingdata}\HRS-SHARE-ELSA_BBG_${versno}_withdisweights.dta", replace


	
////////////////////////////////////////////////////////
// Results
////////////////////////////////////////////////////////

// To run before any of the results below, even if being run separately:
// reloads the globals and the analysis dataset (with disability weights).
do "${user}\OneDrive - King's College London\Disability work\ESRC Future Leaders Disability\Phase 1 (Dis Emp Rates) - Intl\ELSA-SHARE-HRS\0_globals.do"				
use "${workingdata}\HRS-SHARE-ELSA_BBG_${versno}_withdisweights.dta", replace
* Run every time
// Builds ${controlsmeans} as a list of 'varname=mean' pairs for the control variables.
// This can't be done when setting the globals, as it depends on the analysis dataset.
global controlsmeans ""
local col = 0								// column counter into e(b); reset explicitly so a leftover value can't shift the columns
	capture svy: mean ${controls} ${controls2}		// now includes age, rather than European standard pop - see note in 0_globals.do
	if _rc==0	{							// ${controlsmeans} should be empty where ${controls2} is empty
		foreach word in `e(varlist)'	{
			local ++col
			local working = e(b)[1, `col']	// survey-weighted mean of this control
			global controlsmeans "${controlsmeans} `word'=`working'"
		}
	}
	else	{
		// -capture- hides the error; say so, so that an empty list is never a silent surprise
		dis as text "Note: -svy: mean- returned error code " _rc " - controlsmeans is left empty"
	}
// The closing brace of the macro reference must come before the closing quote
dis in red "Controls are ${controls} ${controls2} - their means in the atspec are ${controlsmeans}"
pause Check this is correct!


		// Results without bootstrapping (excludes PPWD and has one-off version of pred_random)
		do "${dodir}\5_nonbootstrap.do"					// See notes at top of results section
		/* Type of work (ft vs. pt) and disben claims are explored in a separate file, 5c_results_other.do */


		// Main version using SIMULATE
		matrix drop _all
		do "${dodir}\5_results_simulate.do"				// Per the original note this just loads the command used below; see notes at top of results section
		global simreps 				"400"				// simulations for the full sample estimate before bootstrapping; bootstrap reps can't be set here, as they are fixed when the replicate weights are created
		global run = 				12					// Some of the dis weights outputs are run-specific, e.g. if we recalculate by gender
		svyset [pw=${pweight}], strata(country)
		// (a ${if} restriction and the allirtvar()/wgvar() options are currently not passed, and are left out of distypes)
		simulate , reps(${simreps}) seed(123${run}456) saving("${workingdata}/justsim_run${run}.dta", replace) :	///
			disemp_sim ${countryvar} [pweight=${pweight}],														///
				distypes("dis predicted fixedpred irt")															///
				controls(${controls}) controls2(${controls2})													///
				empvar(${empvar}) disvar(${disvar}) irtvar(${IRTvar})											///
				fixedpredvar(${predictedvar}_fxd) predictedvar(${predictedvar})
		* Collapse the simulations to one mean per statistic, in the same layout as the bootstrap output, and export
		use "${workingdata}/justsim_run${run}.dta", replace
			unab simvars: _all
			local k = 0
			foreach v of local simvars	{
				local ++k
				* Turn the variable label into an 'equation:name' style matrix name
				local lbl: variable label `v'
				local lbl = subinstr(subinstr(subinstr("`lbl'", "]_b[", ":", .), "[", "", .), "]", "", .)
				summarize `v'
				if `k'==1	{
					* first statistic starts the row vector and the list of names
					matrix output = 		`r(mean)'
					global rownames  = "`lbl'"
				}
				else	{
					matrix output = output, `r(mean)'
					global rownames  = "${rownames} `lbl'"
				}
			}
			matrix colnames output = $rownames
			matrix output = output'						// transpose so that each statistic is a row
		esttab matrix(output) using "${dodir}\outputs\1_bootstrap_run${run}.csv", csv replace not nostar nonum nodepvars b(%5.4f) /// 	
			addnotes("Number of replications is ${simreps}, date outputted is `c(current_date)', subgroup is "${if}"")

			
		// Bootstrapped results (for PPWD, and an alternative version for doing probabilistic disability)
		// The steps below depend on each other through e(): -eststo- stores the bootstrap results as
		// bootstrap_all, -estat bootstrap- is run on them, and -esttab- then exports the stored set.
		do "${dodir}\5_results_bootstrap.do"		// Just loading the bootstrap command; see notes at top of results section for further explanation
		global run = 				1			// Some of the dis weights outputs are run-specific, e.g. if we recalculate by gender
		global reps					"400"				 	// replications for bootstrapping, both BOOTSTRAP & SVY BOOTSTRAP
		* The actual bootstrap command - for error-checking, add 'noisily' back in
		* (resampling is stratified by ${countryvar}; replicates are saved to bs_run${run}.dta; the ${if} restriction is currently commented out)
			eststo bootstrap_all: bootstrap, nodrop reps(${reps}) seed(13062017) saving("${workingdata}/bs_run${run}.dta", replace) strata(${countryvar}) /// 	 bca noisily 
				/**/ :  disempBS ${countryvar} /*${if}*/, distypes("dis predicted fixedpred irt allirt") empvar(${empvar}) controls(${controls}) controls2(${controls2})			///
				/**/ 	disvar(${disvar}) irtvar(${IRTvar}) allirtvar(${allIRTvar}) fixedpredvar(${predictedvar}_fxd) predictedvar(${predictedvar})  // wgvar(${WGvar}) and taken out of distypes
			/* Running these results at a later date
			bstat using "${workingdata}/bs_run${run}.dta"
			*/
			estat bootstrap, bc			// Necessary to recover the main estimate			
			* Outputting the results: two rows per statistic (estimate and lower limits, then bias and upper limits)
			* NOTE(review): the inner double quotes around ${if} in addnotes() end the quoted string early, so the
			* note is probably split into several pieces - confirm the notes in the CSV look as intended
			esttab bootstrap_all using "${dodir}\outputs\1_bootstrap_run${run}.csv", csv replace not ci nostar nonum nodepvars b(%5.4f) /// 	
				cells("b ci_normal[ll] ci_percentile[ll] ci_bc[ll]" "bias ci_normal[ul] ci_percentile[ul] ci_bc[ul]")					///
				addnotes("Col 2 is normal-based CI, col3 is percentile-based CI, col4 is bias-corrected CI" 							///
						 "Number of replications is ${reps}, date outputted is `c(current_date)', subgroup is "${if}"")

	
		// Alternative bootstrapped syntax with weighted data (but results are not trustworthy)
		// Outline: (1) create bootstrap replicate weights and run -svy bootstrap- on them;
		//          (2) re-estimate the full-sample statistics as the mean over ${simreps} simulations;
		//          (3) combine the two with -bstat- and export.
		* Replicate weights
		global reps					"400"				// number of replicate weights; set here so the export notes report the number actually used in this section
		svyset [pw=${pweight}], strata(country)
		bsweights bw, reps(${reps}) average(10) n(-1) seed(10101) balanced dots replace
		save "${workingdata}\HRS-SHARE-ELSA_BBG_${versno}_withBSweights.dta", replace
		* Loading the command
		do "${dodir}\5_results_weightedbootstrap.do"		
		svyset [pw=${pweight}], strata(country) vce(bootstrap) bsrweight(bw*) bsn(10)		// I think 'bsn' is the equivalent to 'vfactor' in the old command bs4rw
		global run = 				11					// Some of the dis weights outputs are run-specific, e.g. if we recalculate by gender
		global simreps 				"400"				// simulations for the full sample estimate before bootstrapping; Can't set bootstrap reps, because these are decided when you're creating the bootstrap replicate weights above
		* The actual bootstrap command - for error-checking, add 'noisily' back in
		svy bootstrap, bsn(10) nodrop saving("${workingdata}/bs_run${run}.dta", replace)  /// 	strata(${countryvar}) can only be applied to svyset// have also taken out noisily bca 
			/**/ :  disempBS_alternate ${countryvar} /*${if}*/, distypes("dis predicted fixedpred irt") empvar(${empvar}) controls(${controls}) controls2(${controls2})			///
			/**/ 	disvar(${disvar}) irtvar(${IRTvar}) fixedpredvar(${predictedvar}_fxd) predictedvar(${predictedvar})  // allirtvar(${allIRTvar})  wgvar(${WGvar}) and taken out of distypes 
		* Then putting bootstrap together with simulate
			// CORRECT INITIAL SAMPLE ESTIMATES USING ${simreps} SIMULATIONS
			use "${workingdata}\HRS-SHARE-ELSA_BBG_${versno}_withdisweights.dta", replace
			global i = 1						// NOTE(review): not used in this file - presumably read inside disempBS; confirm or remove
			simulate , reps(${simreps}) seed(123${run}456) saving("${workingdata}/sim_run${run}.dta", replace)  /// 	
				/**/ :  disempBS ${countryvar} /*${if}*/, distypes("dis predicted fixedpred irt") empvar(${empvar}) controls(${controls}) controls2(${controls2})			///
				/**/ 	disvar(${disvar}) irtvar(${IRTvar}) fixedpredvar(${predictedvar}_fxd) predictedvar(${predictedvar}) // allirtvar(${allIRTvar})  wgvar(${WGvar})  and taken out of distypes
				* Correct output matrix format, to match bootstrap: a row vector with one mean per statistic
				global run = 				11					// repeated so that this part can be run on its own
				use "${workingdata}/sim_run${run}.dta", replace
				unab allvars: _all
				local i = 1
				foreach var in `allvars'	{
					sum `var'
					if `i'==1		matrix output = 		`r(mean)'
					else			matrix output = output, `r(mean)' 
					* Turn the variable label into an 'equation:name' style column name
					local thislab: variable label `var'
					local thislab = subinstr(subinstr(subinstr("`thislab'", "]_b[", ":", .), "[", "", .), "]", "", .)
					if `i'==1		global rownames  = "`thislab'"
					else			global rownames  = "${rownames} `thislab'"
					local ++i
				}
				matrix colnames output = $rownames
			// COMBINING WITH BOOTSTRAPPED DATA
			use 	"${workingdata}/bs_run${run}.dta", replace	
			char _dta[bs_version] 3				// required to get the results labelled correctly - see the PDF manual entry for BSTAT
			bstat in 1/L, stat(output)			// I don't know why the 'in 1/L' is required to get it to look at all reps, but it works! https://www.nber.org/stata/efficient/bootstrap.html
			estimates store bootstrap_alternate // you can't rerun 'estat bootstrap, all' after svy bootstrap
			// Name the stored results explicitly: with no namelist, esttab tabulates any sets stored by
			// -eststo- (e.g. bootstrap_all from the unweighted bootstrap) instead of the results just produced.
			// NOTE(review): the inner double quotes around ${if} in addnotes() end the quoted string early - confirm
			// the notes in the CSV look as intended.
			esttab bootstrap_alternate using "${dodir}\outputs\1_bootstrap_run${run}.csv", csv replace not ci nostar nonum nodepvars b(%5.4f) 		/// 	
				cells("b ci_normal[ll] ci_percentile[ll] ci_bc[ll]" "bias ci_normal[ul] ci_percentile[ul] ci_bc[ul]")			///
				addnotes("Col 2 is normal-based CI, col3 is percentile-based CI, col4 is bias-corrected CI" 					///
						 "Number of replications is ${reps}, date outputted is `c(current_date)', subgroup is "${if}"")

						 
*_____________________________________________________________________________________________________________________________________________________
*
**# ELSA-SHARE vs. EHIS data prep
*_____________________________________________________________________________________________________________________________________________________

* Same preparation as for the main disability weights, but with a different set of versions
* (${whichversions}), saved to its own file for the comparison with EHIS
use "${workingdata}\HRS-SHARE-ELSA_BBG_${versno}.dta", replace
	global whichversions	"IRT L allIRT H predfxd L"
	// Country-wave selection. Poland (#29, 2013) just has too small a sample.
	// NOTE(review): country 29 is in the 2015 list but is not excluded from the 2013 clause, so it
	// appears to be kept for both years - confirm this is intended.
	global countrywaves		"(year==2015 & inlist(country,19,23,29,34,35)) | (year==2013 & !inlist(country,19,23,34,35)) | (country==61 & year==2014) | (country==51 & year==2010)"
	global agerange			"(ragey>=50 & ragey<70)"		// used in dis_weights.do
do "${dodir}\4_dis_weights.do"				// Creates the latent disability variable based on the globals above
* Save the EHIS version, then reload it from disk so the file in memory is the saved one
save "${workingdata}\HRS-SHARE-ELSA_BBG_${versno}_forEHIS.dta", replace
use "${workingdata}\HRS-SHARE-ELSA_BBG_${versno}_forEHIS.dta", replace



